/*****************************************************************************************************************************************
******Code for Currie, Mueller-Smith, Rossin-Slater "Violence while in Utero: The Impacts of Assaults During Pregnancy on Birth Outcomes"**
******July 2020***********************************************************************************************************************/


**** THIS DO-FILE CLEANS THE BIRTH OUTCOMES DATA TO PREPARE TO MERGE IT WITH MATERNAL ADDRESS CHARACTERISTICS AND CRIME DATA ************


* Session setup: turn off output pauses, set the maximum matrix size, empty memory, and close any log left open.
set more off
set matsize 10000

clear all

cap log close

***** Paths to data set locations, results graphs *****
* Fill these in before running. File names are appended directly to the macro contents,
* so each path should end with a directory separator.
global bulk 
global home 
global results 
global graphs 

**** Load in births data
* The file name is quoted so that a path containing spaces in the bulk global is read as a single file name.

use "${bulk}birth_outcomes_chars.dta", clear

* Remove duplicate records before any cleaning.
* Study_ID is unique for every entry in the data, so duplicates are found by comparing every other variable.

* collect the names of all variables except Study_ID
ds Study_ID, not
local non_id_vars `r(varlist)'

* observations that agree on all of those variables (missing values included) share one group number
egen new_id = group(`non_id_vars'), missing

codebook new_id

count

* duplicate counts: overall, for 2006 births, and for all other birth years
duplicates report new_id
duplicates report new_id if dobyyyy==2006
duplicates report new_id if dobyyyy!=2006

* keep one observation per group
duplicates drop new_id, force

count

tabulate dobyyyy

drop new_id


***** DATA CLEANING *****

ren dobyyyy birth_year
ren dobmm birth_month

label var birth_year "Birth Year"
label var birth_month "Birth Month"

*we have ~50 observations with birth year before 1993
*these are late filers and errors, we drop them
drop if birth_year<1993

***group birth years according to how data was read in
* Stata treats a missing value as larger than any number, so the open-ended last range
* excludes missing birth years explicitly; those observations keep source_years missing
* instead of being labelled 2008-2013.
gen source_years = 1 if birth_year<=1994
replace source_years = 2 if birth_year>=1995 & birth_year<=1998
replace source_years = 3 if birth_year>=1999 & birth_year<=2007
replace source_years = 4 if birth_year>=2008 & birth_year<.

label define source 1 "1993-1994" 2 "1995-1998" 3 "1999-2007" 4 "2008-2013"
label values source_years source

tab source_years

label var source_years "Data Years"


***conception year / month

* Gestation length is taken from gestest (weeks).
* Per the original notes, gestcal is in days and is missing for over 3% of observations, more often than gestest.
* Values outside 20-45 weeks are set to missing (the observations themselves are kept);
* this range check also removes non-positive values and the 99 code.
replace gestest = . if gestest<20 | gestest>45

rename gestest gest_weeks
label variable gest_weeks "Gestation in Weeks"

* convert weeks to months using 4.5 weeks per month
generate gest_months = gest_weeks/4.5
label variable gest_months "Gestation in Months"



* conception month = birth month minus gestation in months, rounded down to a whole month
generate birth_ym = ym(birth_year, birth_month)

generate concep_ym = birth_ym - gest_months
replace concep_ym = floor(concep_ym)
format concep_ym %tm
label variable concep_ym "Conception Year/Month"

* split the monthly date into calendar year and month
generate concep_year = year(dofm(concep_ym))
generate concep_month = month(dofm(concep_ym))
label variable concep_year "Conception Year"
label variable concep_month "Conception Month"

tabulate concep_year
tabulate concep_month

***clean up other variables

***child sex
* sex has an extra digit in 1993, so only its first character is kept:
* convert to text, truncate to one character, convert back to a number
tostring sex, replace
replace sex = substr(sex, 1, 1)
destring sex, replace

* indicator is defined only when sex is at most 2
generate cmale = cond(sex==1, 1, 0) if sex<=2
label variable cmale "Child is Male"

***birth at a hospital
* use TYPE_PLACE where it is below 7; otherwise fall back on TYPE_PLACE2 where that is below 9
generate birth_hosp = (TYPE_PLACE==1) if TYPE_PLACE<7
replace birth_hosp = (TYPE_PLACE2==1) if missing(birth_hosp) & TYPE_PLACE2<9
label variable birth_hosp "Birth at Hospital"

***mother's age
rename M_AGE momage
* ages above 55 are set to missing; this also removes the 99 code
replace momage = . if momage>55
label variable momage "Mother's Age"

* age-group indicators (0/1); a missing age is 0 in every group and 1 in momagemiss
generate momage1 = momage<20
generate momage2 = momage<25 & momage>=20
generate momage3 = momage<35 & momage>=25
generate momage4 = momage<. & momage>=35
generate momagemiss = momage==.

label variable momage1 "Mother's Age less than 20"
label variable momage2 "Mother's Age 20-24"
label variable momage3 "Mother's Age 25-34"
label variable momage4 "Mother's Age 35 or more"
label variable momagemiss "Mother's Age Missing"

***mother married
* defined only where marital is below 9
generate mom_married = (marital==2) if marital<9
label variable mom_married "Mother Married"

***mother foreign born
rename M_FOREIGN mom_foreign
replace mom_foreign = . if mom_foreign==99
label variable mom_foreign "Mother Foreign-Born"


***mother's borough of residence
* values above 6 become missing
**** note: value=6 means outside NYC (but gave birth in NYC)
generate mom_borough = borores if borores<=6
label variable mom_borough "Mother's Borough of Residence"

***father's age
rename F_AGE dadage
* only the 99 code is set to missing; no upper age limit is applied here
replace dadage = . if dadage==99
label variable dadage "Father's Age"

generate dadage1 = dadage<20
generate dadage2 = dadage<25 & dadage>=20
generate dadage3 = dadage<35 & dadage>=25
generate dadage4 = dadage<. & dadage>=35
generate dadagemiss = dadage==.

label variable dadage1 "Father's Age less than 20"
label variable dadage2 "Father's Age 20-24"
label variable dadage3 "Father's Age 25-34"
label variable dadage4 "Father's Age 35 or more"
label variable dadagemiss "Father's Age Missing"


***father foreign born
rename F_FOREIGN dad_foreign
replace dad_foreign = . if dad_foreign==99
label variable dad_foreign "Father Foreign-Born"

***mother's race/ethnicity: need to harmonize data from before/after 2008
* M_ETHNI and M_RACE (numeric codes) are used for the 2008+ data;
* M_ETHNI2 and M_RACE2 (M_RACE2 holds string codes) are used for the <2008 data.
* The Hispanic, non-Hispanic white, non-Hispanic black, non-Hispanic Asian and other-race
* indicators are built to be mutually exclusive.

*hispanic/non-hispanic:

gen momhispanic = 0
*2008+ data
replace momhispanic = 1 if M_ETHNI==1 | M_ETHNI==2
*<2008 data
replace momhispanic = 1 if M_ETHNI2==1 | M_ETHNI2==2

*check:
table birth_year, contents(mean momhispanic)

gen momnonhispanicwhite = 0
*2008+ data
replace momnonhispanicwhite = 1 if M_RACE==1 & momhispanic==0
*<2008 data
replace momnonhispanicwhite = 1 if M_RACE2=="1" & momhispanic==0

gen momnonhispanicblack = 0
*2008+ data
replace momnonhispanicblack = 1 if M_RACE==2 & momhispanic==0
*<2008 data
replace momnonhispanicblack = 1 if M_RACE2=="2" & momhispanic==0

gen momasian = 0
*2008+ data
replace momasian = 1 if M_RACE==4 & momhispanic==0
*<2008 data
*asian indian, korean, vietnamese, chinese, japanese, filipino
*the non-Hispanic condition is applied here as well, so that the <2008 rule matches the 2008+ rule
*and the variable agrees with its label (Non-Hispanic Asian)
replace momasian = 1 if inlist(M_RACE2, "A", "B", "D", "4", "5", "8") & momhispanic==0

gen momotherrace = 0
replace momotherrace = 1 if momhispanic==0 & momnonhispanicblack==0 & momnonhispanicwhite==0 & momasian==0
*missing race/ethnicity is not counted as other race
replace momotherrace = 0 if M_ETHNI==9 | M_ETHNI2==9
replace momotherrace = 0 if M_ETHNI==. & birth_year>=2008
replace momotherrace = 0 if M_ETHNI2==. & birth_year<2008

*missing race/ethnicity: ethnicity coded 9, or the era-appropriate ethnicity variable is missing
gen mommissingrace = 0
replace mommissingrace = 1 if M_ETHNI==9 | M_ETHNI2==9
replace mommissingrace = 1 if M_ETHNI==. & birth_year>=2008
replace mommissingrace = 1 if M_ETHNI2==. & birth_year<2008

*check:
table birth_year, contents(mean momhispanic mean momnonhispanicwhite mean momnonhispanicblack mean momotherrace mean momasian)

label var momhispanic "Mother Hispanic"
label var momnonhispanicwhite "Mother Non-Hispanic White"
label var momnonhispanicblack "Mother Non-Hispanic Black"
label var momasian "Mother Non-Hispanic Asian"
label var momotherrace "Mother Other Race"
label var mommissingrace "Mother Missing Race"



***father's race/ethnicity
* F_ETHNI and F_RACE (numeric codes) are used for the 2008+ data;
* F_ETHNI2 and F_RACE2 (F_RACE2 holds string codes) are used for the <2008 data.
* The construction mirrors the mother's race/ethnicity variables.

gen dadhispanic = 0
*2008+ data
replace dadhispanic = 1 if F_ETHNI==1 | F_ETHNI==2
*<2008 data
replace dadhispanic = 1 if F_ETHNI2==1 | F_ETHNI2==2

*check:
table birth_year, contents(mean dadhispanic)

gen dadnonhispanicwhite = 0
*2008+ data
replace dadnonhispanicwhite = 1 if F_RACE==1 & dadhispanic==0
*<2008 data
replace dadnonhispanicwhite = 1 if F_RACE2=="1" & dadhispanic==0

gen dadnonhispanicblack = 0
*2008+ data
replace dadnonhispanicblack = 1 if F_RACE==2 & dadhispanic==0
*<2008 data
replace dadnonhispanicblack = 1 if F_RACE2=="2" & dadhispanic==0

gen dadasian = 0
*2008+ data
replace dadasian = 1 if F_RACE==4 & dadhispanic==0
*<2008 data
*asian indian, korean, vietnamese, chinese, japanese, filipino
*the non-Hispanic condition is applied here as well, so that the <2008 rule matches the 2008+ rule
*and the variable agrees with its label (Non-Hispanic Asian)
replace dadasian = 1 if inlist(F_RACE2, "A", "B", "D", "4", "5", "8") & dadhispanic==0


gen dadotherrace = 0
replace dadotherrace = 1 if dadhispanic==0 & dadnonhispanicblack==0 & dadnonhispanicwhite==0 & dadasian==0
*missing race/ethnicity is not counted as other race
*the code-9 reset matches the rule applied to the mother and keeps other race and missing race from both being 1
replace dadotherrace = 0 if F_ETHNI==9 | F_ETHNI2==9
replace dadotherrace = 0 if F_ETHNI==. & birth_year>=2008
replace dadotherrace = 0 if F_ETHNI2==. & birth_year<2008

*missing race/ethnicity: ethnicity coded 9, or the era-appropriate ethnicity variable is missing
gen dadmissingrace = 0
replace dadmissingrace = 1 if F_ETHNI==9 | F_ETHNI2==9
replace dadmissingrace = 1 if F_ETHNI==. & birth_year>=2008
replace dadmissingrace = 1 if F_ETHNI2==. & birth_year<2008


*check:
table birth_year, contents(mean dadhispanic mean dadnonhispanicwhite mean dadnonhispanicblack mean dadotherrace mean dadasian)

label var dadhispanic "Father Hispanic"
label var dadnonhispanicwhite "Father Non-Hispanic White"
label var dadnonhispanicblack "Father Non-Hispanic Black"
label var dadasian "Father Non-Hispanic Asian"
label var dadotherrace "Father Other Race"
label var dadmissingrace "Father Missing Race"



***mother's and father's education: need to harmonize <2008 with 2008+ data
* The same recode is run first for the mother (M_EDU, M_EDU2) and then for the father (F_EDU, F_EDU2).
* The variable without the 2 suffix holds the 2008+ categories;
* the variable with the 2 suffix holds completed years of education in the <2008 data.

foreach parent in mom dad {

	if "`parent'"=="mom" {
		local edu M_EDU
		local whose "Mother's"
	}
	else {
		local edu F_EDU
		local whose "Father's"
	}

	* less than HS: 2008+ up to 12th grade with no diploma, or <2008 with 11 or fewer completed years
	generate `parent'educ1 = inlist(`edu', 1, 2) | `edu'2<12

	* HS: 2008+ HS graduate or GED, or <2008 with exactly 12 completed years
	generate `parent'educ2 = `edu'==3 | `edu'2==12

	* some college: 2008+ some college or associate degree, or <2008 with 13-15 completed years
	generate `parent'educ3 = inlist(`edu', 4, 5) | (`edu'2>12 & `edu'2<16)

	* college or more: 2008+ bachelor degree or higher, or <2008 with 16+ completed years
	generate `parent'educ4 = (`edu'>=6 & `edu'<9) | (`edu'2>=16 & `edu'2<99)

	* missing: unknown codes (9 or 99), or the era-appropriate variable is missing
	generate `parent'educmiss = `edu'==9 | `edu'2==99 | (`edu'==. & birth_year>=2008) | (`edu'2==. & birth_year<2008)

	*check:
	table birth_year, contents(mean `parent'educ1 mean `parent'educ2 mean `parent'educ3 mean `parent'educ4 mean `parent'educmiss)

	label variable `parent'educ1 "`whose' Education Less than HS"
	label variable `parent'educ2 "`whose' Education HS"
	label variable `parent'educ3 "`whose' Education Some College"
	label variable `parent'educ4 "`whose' Education College or More"
	label variable `parent'educmiss "`whose' Education Missing"
}




***mother worked during pregnancy
rename workpreg mom_workpreg
* anything above 1 is set to missing
replace mom_workpreg = . if mom_workpreg>1
label variable mom_workpreg "Mother Worked During Pregnancy"

***total number live births
* 99 is set to missing in both components before they are added
foreach part of varlist BALIVE_L BALIVE_D {
	replace `part' = . if `part'==99
}

generate prevlivebirths = BALIVE_L + BALIVE_D
*have some observations with very high numbers; totals above 30 are set to missing
replace prevlivebirths = . if prevlivebirths>30
label variable prevlivebirths "Number Previous Live Births"

***number prenatal visits
rename pncvisit numvisits
* set to missing when coded 99, or when a positive number of visits is paired with fpncint equal to 0
*** (number of visits does not line up with date of visit)
replace numvisits = . if numvisits==99 | (numvisits>0 & fpncint==0)
label variable numvisits "Number Prenatal Visits"

***first trimester prenatal care initiation
* set to missing when 999 or above, or when a positive fpncint is paired with zero visits
*** (number of visits does not line up with date of visit)
replace fpncint = . if fpncint>=999 | (fpncint>0 & numvisits==0)

* 1 when fpncint is between 1 and 90; defined only when both variables are non-missing and visits are not zero
generate firsttri_pnc = (fpncint>0 & fpncint<=90) if fpncint<. & numvisits<. & numvisits!=0
label variable firsttri_pnc "1st Trimester Prenatal Care"

* 1 when either measure is zero; defined only when both are non-missing
generate noprenatalcare = (fpncint==0 | numvisits==0) if numvisits!=. & fpncint!=.
label variable noprenatalcare "No Prenatal Care"

table birth_year, contents(mean firsttri_pnc mean numvisits)


*** WIC participation
replace wic = . if wic==9
label variable wic "Mother Received WIC"


*** any risk-factors for this pregnancy 
* MRF_NO_RISK flags pregnancies with no risk factors, so any risk factor is present when the flag is 0.
* This matches how any_abnormal (ABN_NON==0) and any_congen (CGAN_NON==0) are built later in this file.
* NOTE(review): assumes MRF_NO_RISK is coded 1 = no risk factors, 0 = otherwise -- confirm against the codebook.
* The indicator is defined only where MRF_NO_RISK is below 9.
gen anyriskfactors = (MRF_NO_RISK==0) if MRF_NO_RISK<9

label var anyriskfactors "Any Risk Factors"

*** smoking: need to harmonize <2008 and 2008+ data

*by trimester smoking only available in 2008+ data
* one indicator per period: 1 if the cigarette count is positive, defined only where the count is below 99
* periods pre, 1tri, 2tri, 3tri are read from NUMCIG_P3M, NUMCIG_F3M, NUMCIG_S3M, NUMCIG_T3M
local periods pre 1tri 2tri 3tri
local codes   P   F    S    T
forvalues i = 1/4 {
	local per : word `i' of `periods'
	local cd  : word `i' of `codes'
	generate momsmoke_`per' = (NUMCIG_`cd'3M>0) if NUMCIG_`cd'3M<99
}

label variable momsmoke_pre "Mother Smoked Before Pregnancy"
label variable momsmoke_1tri "Mother Smoked in 1st Trimester"
label variable momsmoke_2tri "Mother Smoked in 2nd Trimester"
label variable momsmoke_3tri "Mother Smoked in 3rd Trimester"

* smoked during pregnancy: any trimester indicator equal to 1 (2008+ data) or RF_TOBACCO equal to 1 (<2008 data)
generate momsmoke = momsmoke_1tri==1 | momsmoke_2tri==1 | momsmoke_3tri==1 | RF_TOBACCO==1

* missing when all three trimester indicators are missing (2008+), or RF_TOBACCO is 9 or above (<2008)
replace momsmoke = . if momsmoke_1tri==. & momsmoke_2tri==. & momsmoke_3tri==. & birth_year>=2008
replace momsmoke = . if RF_TOBACCO>=9 & birth_year<2008

*check:
table birth_year, contents(mean momsmoke)

label variable momsmoke "Mother Smoked During Pregnancy"

*** illicit drug use during pregnancy: need to harmonize <2008 and 2008+ data
* RF_DRUG (2008+ data): per the original note, coded values mean yes and missing means no;
* in the code below values under 9 count as yes and the value 9 is set to missing.
* RF_DRUG2 (<2008 data): values up to 5 count as yes, values of 9 or above are set to missing.

generate momdruguse = RF_DRUG<9 | RF_DRUG2<=5

replace momdruguse = . if (RF_DRUG==9 & birth_year>=2008) | (RF_DRUG2>=9 & birth_year<2008)

*check:
table birth_year, contents(mean momdruguse)

label variable momdruguse "Mother Used Drugs During Pregnancy"

*** maternal weight gain during pregnancy
* trim the extremes: negative values and values above 99 are set to missing
replace wgtgain = . if wgtgain<0 | wgtgain>99
label variable wgtgain "Pregnancy Weight Gain (lbs)"

*** mother's BMI (available for 2008+ data only)
* set the unknown codes to missing first
replace prepregwgt = . if prepregwgt==999
replace M_HGT_IN = . if M_HGT_IN==99
replace M_HGT_FT = . if M_HGT_FT==9

* height in inches from the feet and inches components
generate momheight = M_HGT_FT*12 + M_HGT_IN

* weight divided by squared height in inches, times 703
***missing for years <2008
generate mom_bmi = (prepregwgt / (momheight^2)) * 703
label variable mom_bmi "Mother's BMI"

*** any labor/delivery complications due to meconium or premature rupture of membranes
* defined unless LBCOND_UNK equals 1
generate anycompl = (LBCOND_RUPT==1 | LBCOND_MEC==1) if LBCOND_UNK!=1
label variable anycompl "Labor/Delivery Complications (meconium or rupture)"

*** any previous csections: need to harmonize <2008 and 2008+ data
* 2008+ data: NUM_CSEC below 77 counts as a previous c-section; <2008 data: priorcsec equal to 1
* NOTE(review): NUM_CSEC<77 also counts a value of 0 as a previous c-section -- confirm how NUM_CSEC codes no previous c-section
generate prevcsec = NUM_CSEC<77 | priorcsec==1

* missing when the era-appropriate variable is at or above its unknown code (or is itself missing)
replace prevcsec = . if (NUM_CSEC>=99 & birth_year>=2008) | (priorcsec>=9 & birth_year<2008)

label variable prevcsec "Any Previous C-Sections"

*** c-section delivery
rename cesarean csection
replace csection = . if csection==9
label variable csection "C-Section Delivery"



*** labor induction: need to harmonize <2008 and 2008+ data
*note: labor induction is in 2 variables in the 2008+ data (medical induction and artificial rupture of membranes (AROM)), so we get a lot more obs with induced labor in the 2008+ data

* induced if either 2008+ variable equals 1, or the <2008 variable INDU_LABOR equals 1
generate induced = INDU_MEDI==1 | INDU_AROM==1 | INDU_LABOR==1

* missing when both 2008+ variables are 9 or above (2008+), or INDU_LABOR is 9 or above (<2008)
replace induced = . if (INDU_MEDI>=9 & INDU_AROM>=9 & birth_year>=2008) | (INDU_LABOR>=9 & birth_year<2008)

*check:
table birth_year, contents(mean induced)

label variable induced "Induction of Labor"

*** singleton vs. twin or more 
* 1 when NUM_DEL equals 1, otherwise 0 (including when NUM_DEL is missing)
generate singleton = NUM_DEL==1
label variable singleton "Singleton Birth"

***birth weight
* the 9999 code and values below 100 are set to missing
replace bwgt = . if bwgt==9999 | bwgt<100
rename bwgt birthweight
label variable birthweight "Birth Weight (g)"

*** Apgar scores
* lower-case the names and set scores above 10 to missing
foreach minute in 1 5 {
	rename APGAR`minute' apgar`minute'
	replace apgar`minute' = . if apgar`minute'>10
}

label variable apgar1 "APGAR 1-min"
label variable apgar5 "APGAR 5-min"

*** any abnormal conditions of newborn
* 1 when ABN_NON is 0; defined unless ABN_UNK equals 1
generate any_abnormal = (ABN_NON==0) if ABN_UNK!=1
label variable any_abnormal "Any Abnormal Conditions of Newborn"

*** NICU
rename neocare nicu
replace nicu = . if nicu>=9
label variable nicu "NICU Use"

*** Infant transferred: need to harmonize <2008 and 2008+ data
* TRANSFER is checked for 2008+ births and TRANSFER2 for <2008 births
generate inf_trans = TRANSFER==1 | TRANSFER2==1
replace inf_trans = . if (TRANSFER>=9 & birth_year>=2008) | (TRANSFER2>=9 & birth_year<2008)

*check:
table birth_year, contents(mean inf_trans)

label variable inf_trans "Infant Transferred"

*** Any congenital anomalies
* 1 when CGAN_NON is 0; defined unless CGAN_UNK equals 1
generate any_congen = (CGAN_NON==0) if CGAN_UNK!=1
label variable any_congen "Any Congenital Anomalies of Newborn"

*** Infant died
* 1 when INFT_LIVE is 0; defined only where INFT_LIVE is below 9
generate child_death = (INFT_LIVE==0) if INFT_LIVE<9
label variable child_death "Child Died"


*** Depression during pregnancy (2008+ data only)
* both indicators are defined only where DEPRESS_PREG is below 9
generate mom_depressed = (DEPRESS_PREG>1) if DEPRESS_PREG<9
generate mom_depressed_nohelp = (DEPRESS_PREG==4) if DEPRESS_PREG<9

tabulate mom_depressed
tabulate mom_depressed_nohelp

label variable mom_depressed "Mother Depressed During Pregnancy"
label variable mom_depressed_nohelp "Mother Very Depressed, No Help"


***** Keep only newly created variables
***** can edit this list to keep more/fewer variables
* The list is stored in the global macro named varlist: Study_ID, birth year and month,
* and the mother's identifying fields come first, followed by the cleaned variables.

#delimit ;
global varlist Study_ID birth_year birth_month M_FNAME_ANON M_MNAME_ANON M_LNAME_ANON 
M_FNAME_SDX_ID M_MNAME_SDX_ID M_LNAME_SDX_ID M_DOBM_ANON M_DOBDD_ANON M_DOBYY_ANON M_SSN_ID M_SSN_Last4_ID
source_years gest_months gest_weeks birth_ym concep_ym concep_year concep_month
cmale birth_hosp momage momage1 momage2 momage3 momage4 momagemiss mom_married mom_foreign
mom_borough dadage dadage1 dadage2 dadage3 dadage4 dadagemiss dad_foreign
momhispanic momnonhispanicwhite momnonhispanicblack momasian momotherrace mommissingrace
dadhispanic dadnonhispanicwhite dadnonhispanicblack dadasian dadotherrace dadmissingrace
momeduc1 momeduc2 momeduc3 momeduc4 momeducmiss
dadeduc1 dadeduc2 dadeduc3 dadeduc4 dadeducmiss
mom_workpreg prevlivebirths numvisits firsttri_pnc noprenatalcare wic 
anyriskfactors momsmoke_pre momsmoke_1tri momsmoke_2tri momsmoke_3tri momsmoke
momdruguse wgtgain mom_bmi anycompl prevcsec csection induced singleton
birthweight apgar1 apgar5 any_abnormal nicu inf_trans any_congen child_death
 mom_depressed mom_depressed_nohelp;
#delimit cr

keep $varlist

* The file name is quoted so that a path containing spaces in the bulk global is read as a single file name.
save "${bulk}birth_outcomes_CLEAN.dta", replace





